package cn.hchaojie.camp.day12;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;

/**
 * Minimal crawler demo: downloads one web page and prints every URL found
 * in its text to standard output, one URL per line.
 */
public class DemoCrawler {
	/** Regular expression source matching http, https, ftp and ftps URLs. */
	private static final String URL_PATTERN = "((ht|f)tps?):\\/\\/[\\w\\-]+(\\.[\\w\\-]+)+([\\w\\-"
			+ "\\.,@?^=%&:\\/~\\+#]*[\\w\\-\\@?^=%&\\/~\\+#])?";

	/**
	 * Compiled form of {@link #URL_PATTERN}. Compiled once here instead of on
	 * every {@link #parseLine(String)} call, because parseLine runs once per
	 * line of the downloaded page.
	 */
	private static final Pattern URL_REGEX = Pattern.compile(URL_PATTERN);

	/** Smoke test: extracts the single URL from a short sample string. */
	@Test
	public void test1() {
		parseLine("http://baidu.com abb");
	}

	/**
	 * Downloads a fixed page and prints every URL found in it.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		// How to download a web page: open a stream on the URL and read it line by line.
		try {
			URL url = new URL("http://www.jikedaohang.com");

			// try-with-resources closes the reader and the underlying network
			// stream on both the normal and the exceptional path.
			try (InputStream webStream = url.openStream();
					BufferedReader reader = new BufferedReader(
							new InputStreamReader(webStream, "utf-8"))) {
				String line;
				while ((line = reader.readLine()) != null) {
					parseLine(line);
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Extracts every URL contained in {@code line} and prints each one on its
	 * own line to standard output.
	 *
	 * @param line the text to scan; {@code null} is treated as containing no URLs
	 */
	public static void parseLine(String line) {
		if (line == null) {
			return;
		}

		// The matcher walks the string and yields each matching substring in turn.
		Matcher matcher = URL_REGEX.matcher(line);

		while (matcher.find()) {	// found another matching substring
			System.out.println(matcher.group());	// print the matched text
		}
	}
}
